options(connectionObserver = NULL)
suppressPackageStartupMessages({
library(DESeq2)
library(edgeR)
library(here)
library(ggtree)
library(gplots)
library(hyperSpec)
library(LSD)
library(org.Hs.eg.db)
library(plotly)
library(pvclust)
library(RColorBrewer)
library(SummarizedExperiment)
library(tidyverse)
library(treeio)
library(tximeta)
library(vsn)
})
## Warning in system("timedatectl", intern = TRUE): running command 'timedatectl'
## had status 1
source(here("UPSCb-common/src/R/featureSelection.R"))
hpal <- colorRampPalette(c("blue","white","red"))(100)
pal <- brewer.pal(8,"Dark2")
dex and cell will be variables in our model. As they are categorical, we change them to be factors
meta <- read.delim(here("data/airway/airway_meta.txt"),
header = TRUE, as.is = TRUE)
rownames(meta) <- meta$names
meta$dex <- factor(meta$dex)
meta$cell <- factor(meta$cell)
A look at it
meta
## SampleName cell dex albut names avgLength Experiment
## SRR1039508 GSM1275862 N61311 untrt untrt SRR1039508 126 SRX384345
## SRR1039509 GSM1275863 N61311 trt untrt SRR1039509 126 SRX384346
## SRR1039512 GSM1275866 N052611 untrt untrt SRR1039512 126 SRX384349
## SRR1039513 GSM1275867 N052611 trt untrt SRR1039513 87 SRX384350
## SRR1039516 GSM1275870 N080611 untrt untrt SRR1039516 120 SRX384353
## SRR1039517 GSM1275871 N080611 trt untrt SRR1039517 126 SRX384354
## SRR1039520 GSM1275874 N061011 untrt untrt SRR1039520 101 SRX384357
## SRR1039521 GSM1275875 N061011 trt untrt SRR1039521 98 SRX384358
## Sample BioSample
## SRR1039508 SRS508568 SAMN02422669
## SRR1039509 SRS508567 SAMN02422675
## SRR1039512 SRS508571 SAMN02422678
## SRR1039513 SRS508572 SAMN02422670
## SRR1039516 SRS508575 SAMN02422682
## SRR1039517 SRS508576 SAMN02422673
## SRR1039520 SRS508579 SAMN02422683
## SRR1039521 SRS508580 SAMN02422677
What is a factor?
meta$dex
## [1] untrt trt untrt trt untrt trt untrt trt
## Levels: trt untrt
levels(meta$dex)
## [1] "trt" "untrt"
as.integer(meta$dex)
## [1] 2 1 2 1 2 1 2 1
The following code, we do not run, it is there to show how the featureCounts data was generated. featureCounts is a function part of the Rsubread library. It reads BAM files, generated by a “classical” aligner and summarises the overlap of the alignments with the gene annotation. I.e. for every gene locus, it counts the number of reads that aligned there. featureCounts can handle multi-mapping reads, but not in a probabilistic manner.
library(Rsubread)
fc <- featureCounts(files = files,
annot.ext = gtf,
isGTFAnnotationFile = TRUE,
GTF.featureType = "exon",
GTF.attrType = "gene_id",
useMetaFeatures = TRUE,
strandSpecific = 0,
isPairedEnd = TRUE,
nthreads = 6)
Read the data generated from the command above
fc <- readRDS(here("data/airway/featureCounts/star_featureCounts.rds"))
let us inspect it
names(fc)
## [1] "counts" "annotation" "targets" "stat"
counts_featurecounts <- fc$counts
head(counts_featurecounts)
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000223972.5 29 28 23 16 21
## ENSG00000227232.5 843 701 1102 495 838
## ENSG00000278267.1 26 19 24 16 13
## ENSG00000243485.5 20 9 30 9 10
## ENSG00000284332.1 0 0 0 0 0
## ENSG00000237613.2 19 6 19 4 8
## SRR1039517 SRR1039520 SRR1039521
## ENSG00000223972.5 30 5 2
## ENSG00000227232.5 967 578 573
## ENSG00000278267.1 20 12 16
## ENSG00000243485.5 4 9 3
## ENSG00000284332.1 0 0 0
## ENSG00000237613.2 6 14 7
dim(counts_featurecounts)
## [1] 58721 8
fc$stat
## Status SRR1039508 SRR1039509 SRR1039512 SRR1039513
## 1 Assigned 21007653 19184955 25895949 15496792
## 2 Unassigned_Unmapped 0 0 0 0
## 3 Unassigned_MappingQuality 0 0 0 0
## 4 Unassigned_Chimera 0 0 0 0
## 5 Unassigned_FragmentLength 0 0 0 0
## 6 Unassigned_Duplicate 0 0 0 0
## 7 Unassigned_MultiMapping 0 0 0 0
## 8 Unassigned_Secondary 0 0 0 0
## 9 Unassigned_NonSplit 0 0 0 0
## 10 Unassigned_NoFeatures 1922304 1410877 2030626 1083972
## 11 Unassigned_Overlapping_Length 0 0 0 0
## 12 Unassigned_Ambiguity 1358418 1236366 1584019 943994
## SRR1039516 SRR1039517 SRR1039520 SRR1039521
## 1 24998626 31477332 19500283 21608001
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## 7 0 0 0 0
## 8 0 0 0 0
## 9 0 0 0 0
## 10 2133471 2509956 1539435 1791941
## 11 0 0 0 0
## 12 1556731 1927932 1186465 1283094
let us visualise it
stats <- as.matrix(fc$stat %>% column_to_rownames("Status"))
barplot(stats[rowSums(stats)>0,],
beside = TRUE,col = pal[1:3],las=2,
cex.axis = .8,cex.names = .8,
legend.text = rownames(stats)[rowSums(stats)>0],
args.legend = list(horiz=TRUE,cex=.8,x="top",bty="n"))
We list all quant.sf output files from Salmon
salmonfiles <- here("data/airway/salmon/",
meta$names, "/quant.sf")
names(salmonfiles) <- meta$names
stopifnot(all(file.exists(salmonfiles)))
We extend the metadata by adding a column “files” to the metadata table. This table must contain at least two columns: “names” and “files” for importing the data using the tximeta package
coldata <- cbind(meta, files = salmonfiles,
stringsAsFactors = FALSE)
How does it look
coldata
## SampleName cell dex albut names avgLength Experiment
## SRR1039508 GSM1275862 N61311 untrt untrt SRR1039508 126 SRX384345
## SRR1039509 GSM1275863 N61311 trt untrt SRR1039509 126 SRX384346
## SRR1039512 GSM1275866 N052611 untrt untrt SRR1039512 126 SRX384349
## SRR1039513 GSM1275867 N052611 trt untrt SRR1039513 87 SRX384350
## SRR1039516 GSM1275870 N080611 untrt untrt SRR1039516 120 SRX384353
## SRR1039517 GSM1275871 N080611 trt untrt SRR1039517 126 SRX384354
## SRR1039520 GSM1275874 N061011 untrt untrt SRR1039520 101 SRX384357
## SRR1039521 GSM1275875 N061011 trt untrt SRR1039521 98 SRX384358
## Sample BioSample
## SRR1039508 SRS508568 SAMN02422669
## SRR1039509 SRS508567 SAMN02422675
## SRR1039512 SRS508571 SAMN02422678
## SRR1039513 SRS508572 SAMN02422670
## SRR1039516 SRS508575 SAMN02422682
## SRR1039517 SRS508576 SAMN02422673
## SRR1039520 SRS508579 SAMN02422683
## SRR1039521 SRS508580 SAMN02422677
## files
## SRR1039508 /home/Nicolas/BioQAExample/data/airway/salmon//SRR1039508//quant.sf
## SRR1039509 /home/Nicolas/BioQAExample/data/airway/salmon//SRR1039509//quant.sf
## SRR1039512 /home/Nicolas/BioQAExample/data/airway/salmon//SRR1039512//quant.sf
## SRR1039513 /home/Nicolas/BioQAExample/data/airway/salmon//SRR1039513//quant.sf
## SRR1039516 /home/Nicolas/BioQAExample/data/airway/salmon//SRR1039516//quant.sf
## SRR1039517 /home/Nicolas/BioQAExample/data/airway/salmon//SRR1039517//quant.sf
## SRR1039520 /home/Nicolas/BioQAExample/data/airway/salmon//SRR1039520//quant.sf
## SRR1039521 /home/Nicolas/BioQAExample/data/airway/salmon//SRR1039521//quant.sf
Now, we can import the quantifications
Remember that Salmon quantifies abundances at the transcript level
This function will fetch from the Gencode archive (locate at the European Bioinformatics Institue), the annotation relevant to our project, including the transcript information and that of their parental gene
Note: enter 2 when prompted
st <- tximeta::tximeta(coldata)
## importing quantifications
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8
## found matching transcriptome:
## [ GENCODE - Homo sapiens - release 29 ]
## loading existing TxDb created: 2021-06-10 22:59:18
## Loading required package: GenomicFeatures
## loading existing transcript ranges created: 2021-06-10 22:59:21
let us take a look at it
st
## class: RangedSummarizedExperiment
## dim: 205870 8
## metadata(6): tximetaInfo quantInfo ... txomeInfo txdbInfo
## assays(3): counts abundance length
## rownames(205870): ENST00000456328.2 ENST00000450305.2 ...
## ENST00000387460.2 ENST00000387461.2
## rowData names(3): tx_id gene_id tx_name
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
metadata(st)
## $tximetaInfo
## $tximetaInfo$version
## [1] '1.8.4'
##
## $tximetaInfo$type
## [1] "salmon"
##
## $tximetaInfo$importTime
## [1] "2021-06-13 12:17:41 CEST"
##
##
## $quantInfo
## $quantInfo$salmon_version
## [1] "0.12.0" "0.12.0" "0.12.0" "0.12.0" "0.12.0" "0.12.0" "0.12.0" "0.12.0"
##
## $quantInfo$samp_type
## [1] "none" "none" "none" "none" "none" "none" "none" "none"
##
## $quantInfo$opt_type
## [1] "vb" "vb" "vb" "vb" "vb" "vb" "vb" "vb"
##
## $quantInfo$quant_errors
## $quantInfo$quant_errors[[1]]
## list()
##
## $quantInfo$quant_errors[[2]]
## list()
##
## $quantInfo$quant_errors[[3]]
## list()
##
## $quantInfo$quant_errors[[4]]
## list()
##
## $quantInfo$quant_errors[[5]]
## list()
##
## $quantInfo$quant_errors[[6]]
## list()
##
## $quantInfo$quant_errors[[7]]
## list()
##
## $quantInfo$quant_errors[[8]]
## list()
##
##
## $quantInfo$num_libraries
## [1] 1 1 1 1 1 1 1 1
##
## $quantInfo$library_types
## [1] "IU" "IU" "IU" "IU" "IU" "IU" "IU" "IU"
##
## $quantInfo$frag_dist_length
## [1] 1001 1001 1001 1001 1001 1001 1001 1001
##
## $quantInfo$seq_bias_correct
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##
## $quantInfo$gc_bias_correct
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##
## $quantInfo$num_bias_bins
## [1] 4096 4096 4096 4096 4096 4096 4096 4096
##
## $quantInfo$mapping_type
## [1] "mapping" "mapping" "mapping" "mapping" "mapping" "mapping" "mapping"
## [8] "mapping"
##
## $quantInfo$num_targets
## [1] 205870 205870 205870 205870 205870 205870 205870 205870
##
## $quantInfo$serialized_eq_classes
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##
## $quantInfo$length_classes
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 520 520 520 520 520 520 520 520
## [2,] 669 669 669 669 669 669 669 669
## [3,] 1065 1065 1065 1065 1065 1065 1065 1065
## [4,] 2328 2328 2328 2328 2328 2328 2328 2328
## [5,] 205012 205012 205012 205012 205012 205012 205012 205012
##
## $quantInfo$index_seq_hash
## [1] "40849ed828ea7d6a94af54a5c40e5d87eb0ce0fc1e9513208a5cffe59d442292"
##
## $quantInfo$index_name_hash
## [1] "77aca5545a0626421efb4730dd7b95482c77da261f9bdef70d36e25ee68bb7ef"
##
## $quantInfo$index_seq_hash512
## [1] "f37ae2a7412efd8518d68c22fc3fcc2478b59833809382abd5d9055475505e516730c70914343af34c9232af92fe22832e70afde6b38c26381097068e1e82551"
## [2] "f37ae2a7412efd8518d68c22fc3fcc2478b59833809382abd5d9055475505e516730c70914343af34c9232af92fe22832e70afde6b38c26381097068e1e82551"
## [3] "f37ae2a7412efd8518d68c22fc3fcc2478b59833809382abd5d9055475505e516730c70914343af34c9232af92fe22832e70afde6b38c26381097068e1e82551"
## [4] "f37ae2a7412efd8518d68c22fc3fcc2478b59833809382abd5d9055475505e516730c70914343af34c9232af92fe22832e70afde6b38c26381097068e1e82551"
## [5] "f37ae2a7412efd8518d68c22fc3fcc2478b59833809382abd5d9055475505e516730c70914343af34c9232af92fe22832e70afde6b38c26381097068e1e82551"
## [6] "f37ae2a7412efd8518d68c22fc3fcc2478b59833809382abd5d9055475505e516730c70914343af34c9232af92fe22832e70afde6b38c26381097068e1e82551"
## [7] "f37ae2a7412efd8518d68c22fc3fcc2478b59833809382abd5d9055475505e516730c70914343af34c9232af92fe22832e70afde6b38c26381097068e1e82551"
## [8] "f37ae2a7412efd8518d68c22fc3fcc2478b59833809382abd5d9055475505e516730c70914343af34c9232af92fe22832e70afde6b38c26381097068e1e82551"
##
## $quantInfo$index_name_hash512
## [1] "67f286e5ec2895c10aa6e3a8feed04cb3a80747de7b3afb620298078481b303d2f979407fc104d4f3e8af0e82be2acb2f294ee5f2f68c00087f2855120d9f4ed"
## [2] "67f286e5ec2895c10aa6e3a8feed04cb3a80747de7b3afb620298078481b303d2f979407fc104d4f3e8af0e82be2acb2f294ee5f2f68c00087f2855120d9f4ed"
## [3] "67f286e5ec2895c10aa6e3a8feed04cb3a80747de7b3afb620298078481b303d2f979407fc104d4f3e8af0e82be2acb2f294ee5f2f68c00087f2855120d9f4ed"
## [4] "67f286e5ec2895c10aa6e3a8feed04cb3a80747de7b3afb620298078481b303d2f979407fc104d4f3e8af0e82be2acb2f294ee5f2f68c00087f2855120d9f4ed"
## [5] "67f286e5ec2895c10aa6e3a8feed04cb3a80747de7b3afb620298078481b303d2f979407fc104d4f3e8af0e82be2acb2f294ee5f2f68c00087f2855120d9f4ed"
## [6] "67f286e5ec2895c10aa6e3a8feed04cb3a80747de7b3afb620298078481b303d2f979407fc104d4f3e8af0e82be2acb2f294ee5f2f68c00087f2855120d9f4ed"
## [7] "67f286e5ec2895c10aa6e3a8feed04cb3a80747de7b3afb620298078481b303d2f979407fc104d4f3e8af0e82be2acb2f294ee5f2f68c00087f2855120d9f4ed"
## [8] "67f286e5ec2895c10aa6e3a8feed04cb3a80747de7b3afb620298078481b303d2f979407fc104d4f3e8af0e82be2acb2f294ee5f2f68c00087f2855120d9f4ed"
##
## $quantInfo$num_bootstraps
## [1] 0 0 0 0 0 0 0 0
##
## $quantInfo$num_processed
## [1] 22935521 21155707 28136282 16823088 27298970 34298260 21275888 23487860
##
## $quantInfo$num_mapped
## [1] 21072103 19264813 26088219 15654204 25210669 31839924 19643454 21784047
##
## $quantInfo$percent_mapped
## [1] 91.87541 91.06201 92.72092 93.05191 92.35026 92.83248 92.32730 92.74598
##
## $quantInfo$call
## [1] "quant" "quant" "quant" "quant" "quant" "quant" "quant" "quant"
##
## $quantInfo$start_time
## [1] "Thu Mar 21 21:33:45 2019" "Thu Mar 21 21:35:50 2019"
## [3] "Thu Mar 21 21:37:52 2019" "Thu Mar 21 21:40:11 2019"
## [5] "Thu Mar 21 21:41:52 2019" "Thu Mar 21 21:44:04 2019"
## [7] "Thu Mar 21 21:46:38 2019" "Thu Mar 21 21:48:32 2019"
##
## $quantInfo$end_time
## [1] "Thu Mar 21 21:35:49 2019" "Thu Mar 21 21:37:51 2019"
## [3] "Thu Mar 21 21:40:10 2019" "Thu Mar 21 21:41:51 2019"
## [5] "Thu Mar 21 21:44:02 2019" "Thu Mar 21 21:46:37 2019"
## [7] "Thu Mar 21 21:48:31 2019" "Thu Mar 21 21:50:28 2019"
##
##
## $countsFromAbundance
## [1] "no"
##
## $level
## [1] "txp"
##
## $txomeInfo
## $txomeInfo$source
## [1] "GENCODE"
##
## $txomeInfo$organism
## [1] "Homo sapiens"
##
## $txomeInfo$release
## [1] "29"
##
## $txomeInfo$genome
## [1] "GRCh38"
##
## $txomeInfo$fasta
## [1] "ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_29/gencode.v29.transcripts.fa.gz"
##
## $txomeInfo$gtf
## [1] "ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_29/gencode.v29.annotation.gtf.gz"
##
## $txomeInfo$sha256
## [1] "40849ed828ea7d6a94af54a5c40e5d87eb0ce0fc1e9513208a5cffe59d442292"
##
## $txomeInfo$linkedTxome
## [1] FALSE
##
##
## $txdbInfo
## Db type
## "TxDb"
## Supporting package
## "GenomicFeatures"
## Data source
## "ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_29/gencode.v29.annotation.gtf.gz"
## Organism
## "Homo sapiens"
## Taxonomy ID
## "9606"
## miRBase build ID
## NA
## Genome
## "hg38"
## transcript_nrow
## "206694"
## exon_nrow
## "702834"
## cds_nrow
## "274013"
## Db created by
## "GenomicFeatures package from Bioconductor"
## Creation time
## "2019-10-07 10:00:06 -0400 (Mon, 07 Oct 2019)"
## GenomicFeatures version at creation time
## "1.36.4"
## RSQLite version at creation time
## "2.1.2"
## DBSCHEMAVERSION
## "1.2"
colData(st)
## DataFrame with 8 rows and 9 columns
## SampleName cell dex albut names avgLength
## <character> <factor> <factor> <character> <character> <integer>
## SRR1039508 GSM1275862 N61311 untrt untrt SRR1039508 126
## SRR1039509 GSM1275863 N61311 trt untrt SRR1039509 126
## SRR1039512 GSM1275866 N052611 untrt untrt SRR1039512 126
## SRR1039513 GSM1275867 N052611 trt untrt SRR1039513 87
## SRR1039516 GSM1275870 N080611 untrt untrt SRR1039516 120
## SRR1039517 GSM1275871 N080611 trt untrt SRR1039517 126
## SRR1039520 GSM1275874 N061011 untrt untrt SRR1039520 101
## SRR1039521 GSM1275875 N061011 trt untrt SRR1039521 98
## Experiment Sample BioSample
## <character> <character> <character>
## SRR1039508 SRX384345 SRS508568 SAMN02422669
## SRR1039509 SRX384346 SRS508567 SAMN02422675
## SRR1039512 SRX384349 SRS508571 SAMN02422678
## SRR1039513 SRX384350 SRS508572 SAMN02422670
## SRR1039516 SRX384353 SRS508575 SAMN02422682
## SRR1039517 SRX384354 SRS508576 SAMN02422673
## SRR1039520 SRX384357 SRS508579 SAMN02422683
## SRR1039521 SRX384358 SRS508580 SAMN02422677
rowData(st)
## DataFrame with 205870 rows and 3 columns
## tx_id gene_id tx_name
## <integer> <CharacterList> <character>
## ENST00000456328.2 1 ENSG00000223972.5 ENST00000456328.2
## ENST00000450305.2 2 ENSG00000223972.5 ENST00000450305.2
## ENST00000488147.1 9483 ENSG00000227232.5 ENST00000488147.1
## ENST00000619216.1 9484 ENSG00000278267.1 ENST00000619216.1
## ENST00000473358.1 3 ENSG00000243485.5 ENST00000473358.1
## ... ... ... ...
## ENST00000361681.2 206692 ENSG00000198695.2 ENST00000361681.2
## ENST00000387459.1 206693 ENSG00000210194.1 ENST00000387459.1
## ENST00000361789.2 206684 ENSG00000198727.2 ENST00000361789.2
## ENST00000387460.2 206685 ENSG00000210195.2 ENST00000387460.2
## ENST00000387461.2 206694 ENSG00000210196.2 ENST00000387461.2
assays(st) #plural
## List of length 3
## names(3): counts abundance length
head(assay(st)) # the first one by default
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENST00000456328.2 17.896 16.714 12.576 8.970 19.224
## ENST00000450305.2 0.000 0.000 0.000 0.000 0.000
## ENST00000488147.1 66.727 75.392 188.265 43.748 94.695
## ENST00000619216.1 0.000 0.000 0.000 0.000 0.000
## ENST00000473358.1 0.000 0.000 0.000 0.000 0.000
## ENST00000469289.1 0.000 0.000 0.000 0.000 0.000
## SRR1039517 SRR1039520 SRR1039521
## ENST00000456328.2 0.000 0.000 0.000
## ENST00000450305.2 0.000 0.000 0.000
## ENST00000488147.1 92.538 57.415 64.596
## ENST00000619216.1 0.000 0.000 1.000
## ENST00000473358.1 0.000 0.000 0.000
## ENST00000469289.1 0.000 1.059 0.000
However, we usually want to (start) analyse(ing) the data at the gene level
So we summarize the quantifications on the gene level.
sg <- tximeta::summarizeToGene(st)
## loading existing TxDb created: 2021-06-10 22:59:18
## obtaining transcript-to-gene mapping from database
## loading existing gene ranges created: 2021-06-10 23:00:36
## summarizing abundance
## summarizing counts
## summarizing length
Did it work? how many transcripts?
dim(assay(st))
## [1] 205870 8
how many genes?
dim(assay(sg))
## [1] 58294 8
And, how does it look?
head(assay(sg))
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003.14 707.210 463.445 896.252 421.186 1185.870
## ENSG00000000005.5 0.000 0.000 0.000 0.000 0.000
## ENSG00000000419.12 454.000 508.999 606.000 352.999 583.000
## ENSG00000000457.13 311.880 266.807 358.017 217.296 354.140
## ENSG00000000460.16 91.451 74.463 52.361 45.144 107.779
## ENSG00000000938.12 0.000 0.000 2.000 0.000 1.000
## SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003.14 1086.289 802.000 596.220
## ENSG00000000005.5 0.000 0.000 0.000
## ENSG00000000419.12 773.000 410.000 500.001
## ENSG00000000457.13 423.712 297.742 288.368
## ENSG00000000460.16 100.610 97.742 84.634
## ENSG00000000938.12 0.000 0.000 0.000
Looks fine, but given that we have the gene annotation available, let us add gene symbols
sg <- tximeta::addIds(sg, "SYMBOL", gene = TRUE)
## mapping to new IDs using org.Hs.eg.db
## if all matching IDs are desired, and '1:many mappings' are reported,
## set multiVals='list' to obtain all the matching IDs
## 'select()' returned 1:many mapping between keys and columns
head(rowData(sg))
## DataFrame with 6 rows and 3 columns
## gene_id
## <character>
## ENSG00000000003.14 ENSG00000000003.14
## ENSG00000000005.5 ENSG00000000005.5
## ENSG00000000419.12 ENSG00000000419.12
## ENSG00000000457.13 ENSG00000000457.13
## ENSG00000000460.16 ENSG00000000460.16
## ENSG00000000938.12 ENSG00000000938.12
## tx_ids
## <CharacterList>
## ENSG00000000003.14 ENST00000612152.4,ENST00000373020.8,ENST00000614008.4,...
## ENSG00000000005.5 ENST00000373031.4,ENST00000485971.1
## ENSG00000000419.12 ENST00000371588.9,ENST00000466152.5,ENST00000371582.8,...
## ENSG00000000457.13 ENST00000367771.10,ENST00000367770.5,ENST00000367772.8,...
## ENSG00000000460.16 ENST00000498289.5,ENST00000472795.5,ENST00000496973.5,...
## ENSG00000000938.12 ENST00000374005.7,ENST00000399173.5,ENST00000374004.5,...
## SYMBOL
## <character>
## ENSG00000000003.14 TSPAN6
## ENSG00000000005.5 TNMD
## ENSG00000000419.12 DPM1
## ENSG00000000457.13 SCYL3
## ENSG00000000460.16 C1orf112
## ENSG00000000938.12 FGR
Because the tools (DESeq2 and edgeR), we use downstream expect integers (non decimal counts), we round the values. This has no impact on the analysis
counts_salmon <- round(assay(sg, "counts"))
Let’s compare featureCounts vs. Salmon for one sample
spl <- "SRR1039508"
for comparing, we need to make sure, both counts table are sorted in the same way. We create a data.frame containing both sorted according to their gene IDs
gns <- rownames(counts_salmon)
quants <- data.frame(featureCounts = counts_featurecounts[gns, spl],
salmon = counts_salmon[gns, spl])
head(quants)
## featureCounts salmon
## ENSG00000000003.14 702 707
## ENSG00000000005.5 0 0
## ENSG00000000419.12 467 454
## ENSG00000000457.13 279 312
## ENSG00000000460.16 62 91
## ENSG00000000938.12 0 0
Let us plot the raw counts
heatscatter(quants$featureCounts,quants$salmon,
xlab="featureCounts",ylab="salmon",
main="scatterplot (raw counts)")
abline(0,1,lty=2,col="black")
Let us go onto a log scale
heatscatter(log10(quants$featureCounts+1),log10(quants$salmon+1),
xlab="featureCounts",ylab="salmon",
main="scatterplot (log10 counts + 1)")
abline(0,1,lty=2,col="black")
heatscatter(log1p(quants$featureCounts),log1p(quants$salmon),
xlab="featureCounts",ylab="salmon",
main="scatterplot (log counts + 1)")
abline(0,1,lty=2,col="black")
We will use two commonly used tools to achieve the same:
DESeq2 and edgeR
Why? Because you are likely to encounter both in the literature
Differences? Little - mostly taste. DESeq2 abstracts a lot of the decisions from the user, while edgeR leave these all to the user.
Let us take a look at the information we have about our samples to build our model
colData(sg)
## DataFrame with 8 rows and 9 columns
## SampleName cell dex albut names avgLength
## <character> <factor> <factor> <character> <character> <integer>
## SRR1039508 GSM1275862 N61311 untrt untrt SRR1039508 126
## SRR1039509 GSM1275863 N61311 trt untrt SRR1039509 126
## SRR1039512 GSM1275866 N052611 untrt untrt SRR1039512 126
## SRR1039513 GSM1275867 N052611 trt untrt SRR1039513 87
## SRR1039516 GSM1275870 N080611 untrt untrt SRR1039516 120
## SRR1039517 GSM1275871 N080611 trt untrt SRR1039517 126
## SRR1039520 GSM1275874 N061011 untrt untrt SRR1039520 101
## SRR1039521 GSM1275875 N061011 trt untrt SRR1039521 98
## Experiment Sample BioSample
## <character> <character> <character>
## SRR1039508 SRX384345 SRS508568 SAMN02422669
## SRR1039509 SRX384346 SRS508567 SAMN02422675
## SRR1039512 SRX384349 SRS508571 SAMN02422678
## SRR1039513 SRX384350 SRS508572 SAMN02422670
## SRR1039516 SRX384353 SRS508575 SAMN02422682
## SRR1039517 SRX384354 SRS508576 SAMN02422673
## SRR1039520 SRX384357 SRS508579 SAMN02422683
## SRR1039521 SRX384358 SRS508580 SAMN02422677
cell and dex are 2 variables where we have different values
Based on the original publication from this data, we know that they are relevant
colData(sg)$dex
## [1] untrt trt untrt trt untrt trt untrt trt
## Levels: trt untrt
colData(sg)$cell
## [1] N61311 N61311 N052611 N052611 N080611 N080611 N061011 N061011
## Levels: N052611 N061011 N080611 N61311
Both are factors (categorical variables), but for dex, the treatment comes before the control and this implies that the treatment would be the reference we compare to. It would be easier to think the other way around, so we change the order of the level
colData(sg)$dex <- relevel(colData(sg)$dex, ref = "untrt")
colData(sg)$dex
## [1] untrt trt untrt trt untrt trt untrt trt
## Levels: untrt trt
Let us create the DESeq objects
ds_se <- DESeqDataSet(sg, design = ~ cell + dex)
## using counts and average transcript lengths from tximeta
some sanity checking
stopifnot(all(colnames(counts_salmon) == rownames(meta)))
Here is another way to create the same object, relevant if your organism is not among the ones available at Gencode.
meta$dex <- relevel(meta$dex, ref = "untrt")
ds_matrix <- DESeqDataSetFromMatrix(countData = counts_salmon,
colData = meta,
design = ~ cell + dex)
## converting counts to integer mode
Create the object
genetable <- data.frame(gene.id = rownames(counts_salmon),
stringsAsFactors = FALSE)
stopifnot(all(rownames(meta) == colnames(counts_salmon)))
dge <- DGEList(counts = counts_salmon,
samples = meta,
genes = genetable)
names(dge)
## [1] "counts" "samples" "genes"
Here is an illustration of what I meant earlier.
edgeR leaves a lot for the user to do
Here we retrieve the average transcript length, use it to correct the salmon expression values, before calculating the library size factor (i.e. the difference in sequencing depth between samples).
avetxlengths <- assay(sg, "length")
stopifnot(all(rownames(avetxlengths) == rownames(counts_salmon)))
stopifnot(all(colnames(avetxlengths) == colnames(counts_salmon)))
avetxlengths <- avetxlengths/exp(rowMeans(log(avetxlengths)))
offsets <- log(calcNormFactors(counts_salmon/avetxlengths)) +
log(colSums(counts_salmon/avetxlengths))
dge <- scaleOffset(dge, t(t(log(avetxlengths)) + offsets))
names(dge)
## [1] "counts" "samples" "genes" "offset"
dge <- edgeR::calcNormFactors(dge)
## Warning in calcNormFactors.DGEList(dge): object contains offsets, which take precedence over library
## sizes and norm factors (and which will not be recomputed).
dge$samples
## group lib.size norm.factors SampleName cell dex albut
## SRR1039508 1 21071980 1.0560801 GSM1275862 N61311 untrt untrt
## SRR1039509 1 19264691 1.0298325 GSM1275863 N61311 trt untrt
## SRR1039512 1 26088082 0.9857234 GSM1275866 N052611 untrt untrt
## SRR1039513 1 15654069 0.9485718 GSM1275867 N052611 trt untrt
## SRR1039516 1 25210569 1.0297904 GSM1275870 N080611 untrt untrt
## SRR1039517 1 31839813 0.9647851 GSM1275871 N080611 trt untrt
## SRR1039520 1 19643330 1.0345312 GSM1275874 N061011 untrt untrt
## SRR1039521 1 21783926 0.9567274 GSM1275875 N061011 trt untrt
## names avgLength Experiment Sample BioSample
## SRR1039508 SRR1039508 126 SRX384345 SRS508568 SAMN02422669
## SRR1039509 SRR1039509 126 SRX384346 SRS508567 SAMN02422675
## SRR1039512 SRR1039512 126 SRX384349 SRS508571 SAMN02422678
## SRR1039513 SRR1039513 87 SRX384350 SRS508572 SAMN02422670
## SRR1039516 SRR1039516 120 SRX384353 SRS508575 SAMN02422682
## SRR1039517 SRR1039517 126 SRX384354 SRS508576 SAMN02422673
## SRR1039520 SRR1039520 101 SRX384357 SRS508579 SAMN02422683
## SRR1039521 SRR1039521 98 SRX384358 SRS508580 SAMN02422677
Let us visualise the library size factor differences
It is surprisingly similar across all samples, a +/- 5% difference (we are using a toy dataset obviously)
boxplot(dge$samples$norm.factors)
Neither DESeq2 nor edgeR modify the count data. They calculate the parameters (such as the library size factor, the dispersion, etc.) to use in their model. If we want to visualise the data in a reduced dimensionality (not 50+ thousands genes and x samples as a matrix), such as a PCA (principal component analysis), or MDS (Multi-dimensional scaling), we need to normalise the data.
In addition to a difference in sequencing depth between samples, RNA-Seq data suffers from another problem, a mean-variance relationship, i.e. the data is heteroskedastic.
meanSdPlot(log(assay(ds_se)[rowSums(assay(ds_se))>0,]))
## Warning: Removed 16307 rows containing non-finite values (stat_binhex).
To counteract this, one can use a variance stabilising transformation (VST), a heuristic that will transform the variance so it becomes independent of the mean
vsd <- DESeq2::vst(ds_se)
## using 'avgTxLength' from assays(dds), correcting for library size
Et voila! Not perfect but good enough for visualisation
meanSdPlot(log(assay(vsd)[rowSums(assay(vsd))>0,]))
Finally the Biological QA ## DESeq2
DESeq2::plotPCA(vsd, intgroup = "cell")
DESeq2::plotPCA(vsd, intgroup = "dex")
plotMDS(dge, top = 500,
labels = NULL,
col = as.numeric(dge$samples$dex),
pch = as.numeric(dge$samples$cell),
cex = 2, gene.selection = "common")
vst <- assay(vsd)
vst <- vst - min(vst)
meanSdPlot(vst[rowSums(vst)>0,])
pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)
We define the number of variable of the model
nvar=2
And the number of possible combinations
nlevel=nlevels(ds_se$dex) * nlevels(ds_se$cell)
We plot the percentage explained by the different components, the red line represent the number of variable in the model, the orange line the number of variable combinations.
ggplot(tibble(x=1:length(percent),y=cumsum(percent)),aes(x=x,y=y)) +
geom_line() + scale_y_continuous("variance explained (%)",limits=c(0,100)) +
scale_x_continuous("Principal component") +
geom_vline(xintercept=nvar,colour="red",linetype="dashed",size=0.5) +
geom_hline(yintercept=cumsum(percent)[nvar],colour="red",linetype="dashed",size=0.5) +
geom_vline(xintercept=nlevel,colour="orange",linetype="dashed",size=0.5) +
geom_hline(yintercept=cumsum(percent)[nlevel],colour="orange",linetype="dashed",size=0.5)
pc.dat <- bind_cols(PC1=pc$x[,1],
PC2=pc$x[,2],
as.data.frame(colData(ds_se)))
p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=dex,shape=cell,text=SampleName)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")
ggplotly(p) %>%
layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))
Number of genes expressed per condition at different cutoffs
conds <- ds_se$dex
dev.null <- rangeSamplesSummary(counts=vst,
conditions=conds,
nrep=3)
Filter for noise
sels <- rangeFeatureSelect(counts=vst,
conditions=conds,
nrep=3,scale=TRUE)
## Warning in xy.coords(x, y, xlabel, ylabel, log): 9 y values <= 0 omitted from
## logarithmic plot
cutoff.pos <- 2
hm <- heatmap.2(t(scale(t(vst[sels[[cutoff.pos]],]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal)
plot(as.hclust(hm$colDendrogram),xlab="",sub="")
Done to assess the previous dendrogram’s reproducibility
hm.pvclust <- pvclust(data = t(scale(t(vst[sels[[cutoff.pos]],]))),
method.hclust = "ward.D2",
nboot = 1000, parallel = TRUE)
## Creating a temporary cluster...done:
## socket cluster with 63 nodes on host 'localhost'
## Multiscale bootstrap... Done.
plot the clustering with bp and au
plot(hm.pvclust, labels = conds)
pvrect(hm.pvclust)
Somewhat fancier
tree2 <- full_join(as.treedata(hm.pvclust),
coldata %>% rename(label=names), by='label')
## Joining, by = "label"
ggtree(tree2) +
geom_tippoint(aes(shape=cell)) +
geom_tiplab(size=3,aes(label=dex),hjust=-0.5) +
geom_nodelab(aes(label=au,color=au),hjust=-0.5,size=3) +
scale_color_continuous(low="red", high="blue")
bootstrapping results as a table
print(hm.pvclust, digits=3)
##
## Cluster method: ward.D2
## Distance : correlation
##
## Estimates on edges:
##
## si au bp se.si se.au se.bp v c pchi
## 1 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 2 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 3 0.972 0.986 0.987 0.012 0.007 0.001 -2.214 -0.018 0.969
## 4 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 5 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 6 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 7 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## R version 4.0.5 (2021-03-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] GenomicFeatures_1.42.3 vsn_3.58.0
## [3] tximeta_1.8.4 treeio_1.14.4
## [5] forcats_0.5.1 stringr_1.4.0
## [7] dplyr_1.0.5 purrr_0.3.4
## [9] readr_1.4.0 tidyr_1.1.3
## [11] tibble_3.1.0 tidyverse_1.3.0
## [13] RColorBrewer_1.1-2 pvclust_2.2-0
## [15] plotly_4.9.3 org.Hs.eg.db_3.12.0
## [17] AnnotationDbi_1.52.0 LSD_4.1-0
## [19] hyperSpec_0.99-20201127 xml2_1.3.2
## [21] ggplot2_3.3.3 lattice_0.20-41
## [23] gplots_3.1.1 ggtree_2.4.2
## [25] here_1.0.1 edgeR_3.32.1
## [27] limma_3.46.0 DESeq2_1.30.1
## [29] SummarizedExperiment_1.20.0 Biobase_2.50.0
## [31] MatrixGenerics_1.2.1 matrixStats_0.58.0
## [33] GenomicRanges_1.42.0 GenomeInfoDb_1.26.7
## [35] IRanges_2.24.1 S4Vectors_0.28.1
## [37] BiocGenerics_0.36.0
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 backports_1.2.1
## [3] AnnotationHub_2.22.0 BiocFileCache_1.14.0
## [5] lazyeval_0.2.2 splines_4.0.5
## [7] crosstalk_1.1.1 BiocParallel_1.24.1
## [9] digest_0.6.27 ensembldb_2.14.0
## [11] htmltools_0.5.1.1 fansi_0.4.2
## [13] magrittr_2.0.1 memoise_2.0.0
## [15] Biostrings_2.58.0 annotate_1.68.0
## [17] modelr_0.1.8 askpass_1.1
## [19] prettyunits_1.1.1 jpeg_0.1-8.1
## [21] colorspace_2.0-0 rappdirs_0.3.3
## [23] blob_1.2.1 rvest_1.0.0
## [25] haven_2.3.1 xfun_0.22
## [27] hexbin_1.28.2 tximport_1.18.0
## [29] crayon_1.4.1 RCurl_1.98-1.3
## [31] jsonlite_1.7.2 genefilter_1.72.1
## [33] survival_3.2-10 ape_5.5
## [35] glue_1.4.2 gtable_0.3.0
## [37] zlibbioc_1.36.0 XVector_0.30.0
## [39] DelayedArray_0.16.3 scales_1.1.1
## [41] DBI_1.1.1 Rcpp_1.0.6
## [43] progress_1.2.2 viridisLite_0.3.0
## [45] xtable_1.8-4 tidytree_0.3.4
## [47] bit_4.0.4 preprocessCore_1.52.1
## [49] htmlwidgets_1.5.3 httr_1.4.2
## [51] ellipsis_0.3.1 farver_2.1.0
## [53] pkgconfig_2.0.3 XML_3.99-0.6
## [55] sass_0.3.1 dbplyr_2.1.1
## [57] locfit_1.5-9.4 utf8_1.2.1
## [59] labeling_0.4.2 later_1.1.0.1
## [61] tidyselect_1.1.0 rlang_0.4.10
## [63] munsell_0.5.0 BiocVersion_3.12.0
## [65] cellranger_1.1.0 tools_4.0.5
## [67] cachem_1.0.4 cli_2.4.0
## [69] generics_0.1.0 RSQLite_2.2.6
## [71] broom_0.7.6 evaluate_0.14
## [73] fastmap_1.1.0 yaml_2.2.1
## [75] knitr_1.31 bit64_4.0.5
## [77] fs_1.5.0 caTools_1.18.2
## [79] AnnotationFilter_1.14.0 nlme_3.1-152
## [81] mime_0.10 aplot_0.0.6
## [83] biomaRt_2.46.3 compiler_4.0.5
## [85] rstudioapi_0.13 interactiveDisplayBase_1.28.0
## [87] curl_4.3 png_0.1-7
## [89] affyio_1.60.0 testthat_3.0.2
## [91] reprex_2.0.0 geneplotter_1.68.0
## [93] bslib_0.2.4 stringi_1.5.3
## [95] highr_0.8 ProtGenerics_1.22.0
## [97] Matrix_1.3-2 vctrs_0.3.7
## [99] pillar_1.5.1 lifecycle_1.0.0
## [101] BiocManager_1.30.12 jquerylib_0.1.3
## [103] data.table_1.14.0 bitops_1.0-6
## [105] rtracklayer_1.50.0 httpuv_1.5.5
## [107] patchwork_1.1.1 affy_1.68.0
## [109] R6_2.5.0 latticeExtra_0.6-29
## [111] promises_1.2.0.1 KernSmooth_2.23-18
## [113] gtools_3.8.2 assertthat_0.2.1
## [115] openssl_1.4.3 rprojroot_2.0.2
## [117] withr_2.4.1 GenomicAlignments_1.26.0
## [119] Rsamtools_2.6.0 GenomeInfoDbData_1.2.4
## [121] hms_1.0.0 rmarkdown_2.7
## [123] rvcheck_0.1.8 shiny_1.6.0
## [125] lubridate_1.7.10